Libraries

library(readr)
library(plotly)
library(ggplot2)
library(highcharter)
library(ngram)
library(png)
library(stringr)
library(dplyr)
library("tm")
library("wordcloud")
library(corrplot)
library(arules)
library(arulesViz)
library(colorspace)

Preparing dataset:

# Data files live under ./movie relative to this directory.
# NOTE(review): hard-coded absolute setwd() breaks on any other machine —
# prefer relative paths or an RStudio project root.
setwd("/Users/Apple/Documents/TaraFiles/University/term 8/Data Analysis/week 12/")

# movies.dat rows look like  MovieID::Title::Genres .
# The interleaved NA column names absorb the empty fields produced when the
# line is tokenized, so columns 1/3/5 end up holding MovieID, Title, Genres.
# TODO(review): confirm how this readr version tokenizes the two-character
# "::" delimiter — the NA-column trick suggests it effectively splits on ":".
movie = read_delim("./movie/movies.dat",
                   delim = "::",col_names = c("MovieID",NA,"Title",NA,"Genres"))
movie = movie %>% select(1,3,5)
# IDs come in as character; convert so joins against ratings/tags work.
movie$MovieID=as.numeric(movie$MovieID)

# tags.dat rows: UserID::MovieID::Tag::Timestamp (same delimiter trick).
tag = read_delim("./movie/tags.dat",
                 delim = "::",col_names = c("UserID",NA,"MovieID",NA,"Tag",NA,"Timestamp"))
tag = tag %>% select(1,3,5,7)
tag$MovieID=as.numeric(tag$MovieID)

# ratings.dat rows: UserID::MovieID::Rating::Timestamp.
rating = read_delim("./movie/ratings.dat",
                    delim = "::",col_names = c("UserID",NA,"MovieID",NA,"Rating",NA,"Timestamp"))
rating = rating %>% select(1,3,5,7)
rating$MovieID=as.numeric(rating$MovieID)
rating$Rating=as.numeric(rating$Rating)

1.

# Most popular movies: rank all movies by mean rating.
# NOTE(review): a plain mean with no minimum-vote threshold lets movies with a
# single 5-star rating top the list; consider also requiring n() >= some floor.
popularity <- rating %>%
  select(MovieID, Rating) %>%
  group_by(MovieID) %>%
  summarise(meanRate = mean(Rating, na.rm = TRUE)) %>%  # TRUE, not T
  arrange(desc(meanRate))
popularID <- popularity$MovieID[1:5]
# Titles of the top five (the which() wrapper was redundant).
movie$Title[movie$MovieID %in% popularID]
## [1] "Satan's Tango (Sátántangó) (1994)"    
## [2] "Shadows of Forgotten Ancestors (1964)"
## [3] "Fighting Elegy (Kenka erejii) (1966)"
# Most-rated movies ("number of comments" = number of ratings per movie).
num_of_Comments <- rating %>%
  select(MovieID, Rating) %>%
  group_by(MovieID) %>%
  summarise(numComments = n()) %>%
  arrange(desc(numComments))
num_of_CommentsID <- num_of_Comments$MovieID[1:5]
# Titles of the five most-rated movies.
movie$Title[movie$MovieID %in% num_of_CommentsID]
## [1] "Pulp Fiction (1994)"              "Shawshank Redemption, The (1994)"
## [3] "Forrest Gump (1994)"              "Jurassic Park (1993)"            
## [5] "Silence of the Lambs, The (1991)"
# Least popular movies: lowest mean rating first.
L_pop <- popularity %>% arrange(meanRate)
L_popID <- L_pop$MovieID[1:3]
# Titles of the bottom three (the which() wrapper was redundant).
movie$Title[movie$MovieID %in% L_popID]
## [1] "Besotted (2001)"     "Hi-Line, The (1999)"
# Number of movies released each year.
# The year is the "(YYYY)" suffix of the title: characters -5..-2 from the end.
# Titles without that suffix yield NA and are filtered out below.
movie <- movie %>% mutate(year = as.numeric(str_sub(Title, -5, -2)))

Movie_Year <- movie %>%
  filter(!is.na(year)) %>%
  group_by(year) %>%
  summarise(numMovie = n())
# Drop the 10 earliest (sparse) years.
# NOTE(review): index-based row drop is fragile — confirm these are intended.
Movie_Year <- Movie_Year[-c(1:10), ]

Movie_Year %>%
  hchart(type = "bar", hcaes(x = year, y = numMovie), name = "num of movie each year") %>%
  hc_title(text = "num of movie each year")
# Favourite genre per year: first add one 0/1 indicator column per genre.
# Names map output column -> literal substring searched for in the
# pipe-separated Genres field (fixed() = no regex interpretation).
genre_patterns <- c(
  Action = "Action", Adventure = "Adventure", Animation = "Animation",
  Children = "Children", Comedy = "Comedy", Crime = "Crime",
  Documentary = "Documentary", Drama = "Drama", Fantasy = "Fantasy",
  Film_Noir = "Film-Noir", Horror = "Horror", Mystery = "Mystery",
  Romance = "Romance", Sci_Fi = "Sci-Fi", Thriller = "Thriller",
  War = "War", Western = "Western"
)
# Replaces a 17-way copy-pasted mutate(); columns are appended in the same
# order as before, so positions 5:21 still hold the indicators.
movie_with_genres <- movie
for (col in names(genre_patterns)) {
  movie_with_genres[[col]] <-
    as.numeric(str_detect(movie$Genres, fixed(genre_patterns[[col]])))
}


# Attach each movie's mean rating to its genre indicator columns.
genre_movie_rate <- right_join(popularity, movie_with_genres, by = "MovieID")


# Per-year mean rating of each genre: sum(indicator * meanRate) / sum(indicator)
# averages meanRate over that year's movies carrying the genre.
# across() replaces 17 copy-pasted summarise terms (requires dplyr >= 1.0).
genre_cols <- c("Action", "Adventure", "Animation", "Children", "Comedy",
                "Crime", "Documentary", "Drama", "Fantasy", "Film_Noir",
                "Horror", "Mystery", "Romance", "Sci_Fi", "Thriller",
                "War", "Western")
movie_year_genres <- genre_movie_rate %>%
  group_by(year) %>%
  na.omit() %>%
  summarise(across(all_of(genre_cols), ~ sum(.x * meanRate) / sum(.x)))
# Years with no movie of a genre divide 0/0 -> NaN; zero those cells out.
movie_year_genres[is.na(movie_year_genres)] <- 0

# Highest-rated genre per year. The 17 genre columns sit right after `year`,
# i.e. columns 2:18.
# BUG FIX: the original indexed 2:17, silently excluding the last genre
# (Western) from both the maximum and the winner lookup.
genre_idx <- 2:18
movie_year_genres[, "max"] <- do.call(pmax, movie_year_genres[genre_idx])
movie_year_genres$pop_genre <- ""
for (i in seq_len(nrow(movie_year_genres))) {
  # unlist() turns the one-row slice into a named numeric vector for which.max.
  best <- which.max(unlist(movie_year_genres[i, genre_idx]))
  movie_year_genres$pop_genre[i] <- names(movie_year_genres)[genre_idx][best]
}


# Most popular (highest mean-rated) genre per year, bar height = its mean rate.
movie_year_genres %>%
  hchart(type = "bar", hcaes(x = year, y = max, name = pop_genre, group = pop_genre)) %>%
  hc_title(text = "pop_genre of each year") %>%
  # BUG FIX: hc_subtitle() needs text= ; a bare positional string is not
  # mapped to the subtitle text.
  hc_subtitle(text = "hold on the bar to see the name of the genre")
# Same chart restricted to years after 1980.
movie_year_genres %>%
  filter(year > 1980) %>%
  hchart(type = "bar", hcaes(x = year, y = max, name = pop_genre, group = pop_genre)) %>%
  hc_title(text = "pop_genre of each year (after 1980)") %>%
  hc_subtitle(text = "hold on the bar to see the name of the genre")

***

2.

# Number of movies per genre.

# Canonical genre display names (reused below for the mean-rate chart).
genreName <- c("Action", "Adventure", "Animation", "Children's", "Comedy",
               "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
               "Horror", "Mystery", "Romance", "Sci-Fi", "Thriller",
               "War", "Western")

# Count how many movies carry each genre tag.
# (The original reused one name, GenereInMovie, for both the raw column and
# the count table; give the count table its own name.)
genre_counts <- strsplit(movie$Genres, "\\|") %>%
  unlist() %>%
  table() %>%
  as.data.frame(stringsAsFactors = FALSE)
# Drop leading junk tokens left over from parsing.
# NOTE(review): index-based drop is tied to this dataset — verify rows 1:19.
genre_counts <- genre_counts[-c(1:19), ]
names(genre_counts) <- c("Genre", "Freq")
genre_counts %>%
  arrange(Freq) %>%
  hchart(type = "bar", hcaes(x = Genre, y = Freq), name = " num of movie of each genre") %>%
  hc_title(text = " num of movie of each genre")
#### Correlation plot of the genres

# Pairwise Pearson correlation between the 0/1 genre indicator columns
# (columns 5:21 of movie_with_genres), NA rows removed first.
genre_ind <- na.omit(movie_with_genres[, 5:21])

corr_mat <- cor(genre_ind, method = "pearson", use = "complete.obs")
corrplot(corr_mat, type = "upper", order = "hclust",
         tl.col = "black", tl.srt = 45)

# Mean rating of each genre over all years.
# across() replaces 17 copy-pasted summarise terms (requires dplyr >= 1.0);
# the result is one row of per-genre means, in the same order as genreName.
genre_cols <- c("Action", "Adventure", "Animation", "Children", "Comedy",
                "Crime", "Documentary", "Drama", "Fantasy", "Film_Noir",
                "Horror", "Mystery", "Romance", "Sci_Fi", "Thriller",
                "War", "Western")
ccc <- genre_movie_rate %>%
  na.omit() %>%
  summarise(across(all_of(genre_cols), ~ sum(.x * meanRate) / sum(.x)))
rate <- as.numeric(ccc[1, ])
c4 <- data.frame(genreName = genreName, rate = rate)

c4 %>%
  arrange(rate) %>%
  hchart(type = "bar", hcaes(x = genreName, y = rate, name = genreName, group = genreName)) %>%
  hc_title(text = "mean rate of genres")
# Golden age of film-making: years whose movies have the highest mean rating.
goldenAge <- right_join(popularity, movie, by = "MovieID") %>%
  group_by(year) %>%
  summarise(meanRateYear = mean(meanRate))
# Drop the 10 earliest (sparse) years.
# NOTE(review): index-based drop is fragile — confirm these are intended.
goldenAge <- goldenAge[-c(1:10), ]

goldenAge <- goldenAge %>% na.omit() %>% arrange(desc(meanRateYear))

head(goldenAge, n = 3)
## # A tibble: 3 x 2
##    year meanRateYear
##   <dbl>        <dbl>
## 1  1924         3.93
## 2  1916         3.79
## 3  1931         3.71

3.

# Word cloud of movie-title words. pureTitle drops the trailing " (YYYY)".
movie%>%mutate(pureTitle=str_sub(Title,1,-7))%>%select(pureTitle)->movieTitle


# NOTE(review): movieTitle is a one-column tibble, not a character vector —
# str_replace_all() does not operate per title here; the pipeline appears to
# rely on the tibble being coerced/deparsed. Consider pull(pureTitle) first,
# but re-check the hand-picked row drops below if you change this.
movieTitle%>%str_replace_all("[[:punct:]]"," ") %>% 
  str_split(pattern = "\\s") %>% 
  unlist() %>% 
  str_to_lower() %>% 
  # Remove English and French stopwords before counting word frequencies.
  removeWords(., stopwords('en')) %>% 
  removeWords(., stopwords('fr')) %>%
  str_trim() %>% 
  table() %>% 
  as.data.frame(stringsAsFactors = F)->title_word
colnames(title_word) = c("word","count")
# Keep words seen more than five times, most frequent first.
title_word = title_word %>% arrange(desc(count)) %>% filter(count>5)

# Hand-picked removal of junk tokens (empty strings, stray letters, etc.).
# NOTE(review): these indices are tied to this exact dataset and run.
title_word=title_word[-c(1:4,9,11,35,44,64,75,102),]
wordcloud(title_word$word,title_word$count,
          c(5,.3), random.order = FALSE, colors=brewer.pal(8, "Dark2"))


4.

# Q4: market-basket analysis on liked movies (rating > 3).
# Build one comma-separated basket of MovieIDs per user, then mine rules.
User_Basket <- rating %>%
  filter(Rating > 3) %>%
  select(MovieID, UserID) %>%
  group_by(UserID) %>%
  # base paste() replaces ngram::concatenate() — same output, one less package.
  summarise(basket = paste(MovieID, collapse = ","))
# strsplit() is already vectorised and returns the list of character baskets
# directly (the lapply wrapper around it was redundant).
basket <- strsplit(User_Basket$basket, split = ",")

grules <- apriori(basket, parameter = list(support = 0.009,
                                           confidence = 0.25, minlen = 2))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##        0.25    0.1    1 none FALSE            TRUE       5   0.009      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 628 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[10472 item(s), 69816 transaction(s)] done [0.78s].
## sorting and recoding items ... [1750 item(s)] done [0.13s].
## creating transaction tree ... done [0.06s].
## checking subsets of size 1 2 3 done [18.13s].
## writing ... [11251266 rule(s)] done [0.95s].
## creating S4 object  ... done [2.29s].
# Peek at the first ten mined rules (in the order apriori returned them).
inspect(grules[1:10])
##      lhs      rhs   support     confidence lift     count
## [1]  {140} => {62}  0.009381804 0.5165615  4.795141 655  
## [2]  {135} => {780} 0.009367480 0.6318841  3.329229 654  
## [3]  {257} => {161} 0.009410450 0.6021998  4.542263 657  
## [4]  {257} => {349} 0.009840151 0.6296975  4.026650 687  
## [5]  {257} => {150} 0.010112295 0.6471127  2.315557 706  
## [6]  {257} => {47}  0.009195600 0.5884510  2.315465 642  
## [7]  {257} => {457} 0.011573278 0.7406049  2.278202 808  
## [8]  {257} => {110} 0.010255529 0.6562786  1.999335 716  
## [9]  {257} => {356} 0.009797181 0.6269478  1.665246 684  
## [10] {257} => {318} 0.011272488 0.7213566  1.769704 787
# Seed movies for recommendations (title -> MovieID):
#Castle in the Sky (1986)   6350
#Cast Away (2000)           4022
#No Country for Old Men (2007)    51372
#Memento (2000)             4226


# %pin% does partial (substring) matching on item labels, so "6350" would also
# match a hypothetical ID like "16350" — NOTE(review): verify no such overlap.
movierules = subset(grules, lhs %pin% c("6350","4022","51372","4226"))
# Top 32 rules by lift; inspect() prints them and — NOTE(review): in recent
# arules versions — invisibly returns a data frame, captured here as mn.
inspect(sort(movierules, by = "lift")[1:32])->mn
##      lhs            rhs    support     confidence lift      count
## [1]  {5618,6350} => {3000} 0.010398762 0.8066667  16.433685 726  
## [2]  {4993,6350} => {3000} 0.009510714 0.7923628  16.142282 664  
## [3]  {2571,6350} => {3000} 0.009081013 0.7788698  15.867398 634  
## [4]  {6350}      => {3000} 0.011344105 0.7600768  15.484540 792  
## [5]  {3000,6350} => {5618} 0.010398762 0.9166667  14.917949 726  
## [6]  {296,6350}  => {5618} 0.009195600 0.8991597  14.633038 642  
## [7]  {6350,7153} => {5618} 0.009911768 0.8975357  14.606608 692  
## [8]  {5952,6350} => {5618} 0.009840151 0.8910506  14.501069 687  
## [9]  {4993,6350} => {5618} 0.010670906 0.8890215  14.468047 745  
## [10] {1196,6350} => {5618} 0.009281540 0.8888889  14.465890 648  
## [11] {2959,6350} => {5618} 0.009066690 0.8853147  14.407723 633  
## [12] {2571,6350} => {5618} 0.010255529 0.8796069  14.314833 716  
## [13] {260,6350}  => {5618} 0.009195600 0.8770492  14.273209 642  
## [14] {6350}      => {5618} 0.012891028 0.8637236  14.056347 900  
## [15] {5952,6350} => {7153} 0.010026355 0.9079118   5.938983 700  
## [16] {4993,6350} => {7153} 0.010441733 0.8699284   5.690520 729  
## [17] {2571,6350} => {7153} 0.009424774 0.8083538   5.287738 658  
## [18] {6350}      => {4973} 0.009711241 0.6506718   5.267544 678  
## [19] {6350,7153} => {5952} 0.010026355 0.9079118   5.177389 700  
## [20] {4993,6350} => {5952} 0.010627936 0.8854415   5.049251 742  
## [21] {5618,6350} => {7153} 0.009911768 0.7688889   5.029584 692  
## [22] {5952,6350} => {4993} 0.010627936 0.9623865   4.964165 742  
## [23] {6350,7153} => {4993} 0.010441733 0.9455253   4.877192 729  
## [24] {6350}      => {7153} 0.011043314 0.7399232   4.840109 771  
## [25] {1196,6350} => {4993} 0.009310187 0.8916324   4.599202 650  
## [26] {2571,6350} => {5952} 0.009396127 0.8058968   4.595646 656  
## [27] {260,6350}  => {4993} 0.009295863 0.8866120   4.573307 649  
## [28] {2571,6350} => {4993} 0.010198235 0.8746929   4.511825 712  
## [29] {6350}      => {3996} 0.009682594 0.6487524   4.380820 676  
## [30] {5618,6350} => {5952} 0.009840151 0.7633333   4.352927 687  
## [31] {3000,6350} => {4993} 0.009510714 0.8383838   4.324537 664  
## [32] {5618,6350} => {4993} 0.010670906 0.8277778   4.269829 745
# Distinct right-hand sides of the top rules = candidate recommendations.
unique(mn$rhs)
## [1] {3000} {5618} {7153} {4973} {5952} {4993} {3996}
## Levels: {3000} {3996} {4973} {4993} {5618} {5952} {7153}
# Strip the surrounding braces from the rhs labels ("{3000}" -> 3000) to
# recover numeric MovieIDs.
mn[,c(1,3)]%>%mutate(MovieID=as.numeric(str_sub(rhs,2,-2)))->moviecode
unique(moviecode$MovieID)
## [1] 3000 5618 7153 4973 5952 4993 3996
# Map the recommended IDs back to their titles (first five shown).
movie%>%filter(MovieID %in% unique(moviecode$MovieID))%>%.[1:5,]
## # A tibble: 5 x 4
##   MovieID Title                             Genres                    year
##     <dbl> <chr>                             <chr>                    <dbl>
## 1    3000 Princess Mononoke (Mononoke-hime… Action|Adventure|Animat…  1997
## 2    3996 Crouching Tiger, Hidden Dragon (… Action|Adventure|Drama|…  2000
## 3    4973 Amelie (Fabuleux destin d'Amélie… Comedy|Romance            2001
## 4    4993 Lord of the Rings                 <NA>                        NA
## 5    5618 Spirited Away (Sen to Chihiro no… Adventure|Animation|Chi…  2001

5.

https://github.com/Tara1376/DA_HW.git


6.

من انتظار داشتم تئوری درس بیشتر باشد. قبول دارم که درس ، برنامه نویسی و کد زنی بود ولی جا داشت بیشتر و عمیق تر مباحث تئوری رو بررسی می کردیم. مثلا در خوشه بندی و PCA می شد عمیق تر شد و برای من جذاب تر هم می بود.

بعضی از دیتاست ها واقعا کسل کننده بودن :))) مثلا دیتاست لالیگا. تهشم نفهمیدم به چه دردی می خورد :)

قسمت های انتهایی درس، مخصوصا تمرین ۱۲ کاش بیشتر می بود. برای مباحثی مثل نقشه خیلی وقت گذاشتیم، ولی این سیستم پیشنهاد دهنده به نظرم مفیدتر و مهم تر بود.

تمرین ها خیلی هاشون گنگ بودن، و همین باعث می شد بیخودی وقت زیادی صرف یک تمرین ساده بشه.

پروژه تا الان هیچ فیدبکی نداشتیم. کاش روی پروپوزال اولیه فید بک میدادین تا الان، و با فاز یک رو نظر میدادین. اصلا نمیدونیم چه حجمی کار لازم داره. تمرین ها هم هیچ فیدبکی نداشتیم، ممکنه آدم تا آخر راه رو اشتباه بره اینجوری :)


7.

ساعت کلاس رو به ۱۰-۱۲ تغییر بدید

روی چند تا تمرین اول قبل عید فیدبک بدین، که بقیه رو درست انجام بدیم.

پروژه رو خوب میشد بعد نوشتن پروپوزال یا حداقل بعد فاز یک، یک تحویل حضوری میداشت که بفهمیم پروژه رو درست انتخاب کردیم اصلا یا نه

مباحث تئوری رو عمیق تر بخونیم توی درس و حتی یک تمرین کاملا تئوری می داشتیم بد نبود

اوایل ترم که مباحث ساده تر هستند رو سریعتر درس بدین تا اواخر ترم مباحث سخت تر و مهم تر رو وقت کافی داشته باشیم.


8.

nonlinear dimensionality reduction, Kernel PCA

linear SVM , and nonlinear SVM

Fuzzy clustering


9.

داده ای پزشکی نداشتیم. دیتاست های فراوانی با علائم بیمار و تشخیص پزشک وجود دارد.

در مباحث مربوط به انرژی دیتا ست های زیادی در باره برق و یا نیروگاه های انرژی های نو وجود دارد. چون اکثر بچه ها دانشجوی مهندسی اند جالب خواهد بود.

برای تشخیص ایمیل و پیامک اسپم چندین دیتا ست در کگل وجود دارد. به عنوان تمرین تحلیل متن این دیتاست ها هم جذابه.


10.

از یک داده میتوان به گونه ای اطلاعات را خارج کرد و یا نشان داد که برداشت فردی که خروجی را میبیند متفاوت از حقیقت باشد

از آزمون فرض در پروژه درس های دیگر هم استفاده کردم.

مدل خطی و مباحث تئوری اش

factor analysis